This project aims to perform an in-depth analysis of the company's employee data, which encompasses a wide range of information such as personal details, job metrics, performance ratings, and compensation figures. The objective is to uncover insights that can help in making data-driven decisions in areas like salary management, employee retention, performance evaluation, and workload distribution.
Throughout this analysis, we will:
By the end of this project, we hope to provide actionable recommendations that can enhance HR management and improve overall organizational efficiency.
import pandas as pd
import numpy as np
df=pd.read_excel('Employees.xlsx')
df
| No | First Name | Last Name | Gender | Start Date | Years | Department | Country | Center | Monthly Salary | Annual Salary | Job Rate | Sick Leaves | Unpaid Leaves | Overtime Hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Ghadir | Hmshw | Male | 2018-04-04 | 2 | Quality Control | Egypt | West | 1560 | 18720 | 3.0 | 1 | 0 | 183 |
| 1 | 2 | Omar | Hishan | Male | 2020-05-21 | 0 | Quality Control | Saudi Arabia | West | 3247 | 38964 | 1.0 | 0 | 5 | 198 |
| 2 | 3 | Ailya | Sharaf | Female | 2017-09-28 | 3 | Major Mfg Projects | Saudi Arabia | West | 2506 | 30072 | 2.0 | 0 | 3 | 192 |
| 3 | 4 | Lwiy | Qbany | Male | 2018-08-14 | 2 | Manufacturing | United Arab Emirates | Main | 1828 | 21936 | 3.0 | 0 | 0 | 7 |
| 4 | 5 | Ahmad | Bikri | Male | 2020-03-11 | 0 | Manufacturing | Egypt | Main | 970 | 11640 | 5.0 | 0 | 5 | 121 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 684 | 685 | Sari | Hanna | Male | 2020-05-26 | 0 | Marketing | Lebanon | South | 1452 | 17424 | 2.0 | 0 | 3 | 1 |
| 685 | 686 | Eubayda | Kayd | Male | 2020-06-03 | 0 | Facilities/Engineering | Egypt | North | 3237 | 38844 | 3.0 | 1 | 0 | 4 |
| 686 | 687 | Khalil | Alkalu | Male | 2017-07-11 | 3 | Facilities/Engineering | Egypt | North | 2819 | 33828 | 5.0 | 0 | 0 | 0 |
| 687 | 688 | Muhamad | Shrbjy | Male | 2018-05-30 | 2 | Creative | Egypt | North | 2069 | 24828 | 3.0 | 0 | 0 | 10 |
| 688 | 689 | Abd Albasit | AlAhmar | Male | 2020-08-05 | 0 | IT | United Arab Emirates | North | 2606 | 31272 | 5.0 | 0 | 0 | 0 |
689 rows × 15 columns
male=df[df['Gender']=="Male"]
female=df[df['Gender']=="Female"]
df.describe()
| No | Start Date | Years | Monthly Salary | Annual Salary | Job Rate | Sick Leaves | Unpaid Leaves | Overtime Hours | |
|---|---|---|---|---|---|---|---|---|---|
| count | 689.000000 | 689 | 689.000000 | 689.000000 | 689.000000 | 689.000000 | 689.000000 | 689.000000 | 689.000000 |
| mean | 345.000000 | 2019-01-25 09:20:06.966618368 | 1.476052 | 2068.201742 | 24818.420900 | 3.586357 | 1.609579 | 0.759071 | 13.702467 |
| min | 1.000000 | 2016-01-08 00:00:00 | 0.000000 | 703.000000 | 8436.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 173.000000 | 2018-04-05 00:00:00 | 1.000000 | 1436.000000 | 17232.000000 | 3.000000 | 0.000000 | 0.000000 | 3.000000 |
| 50% | 345.000000 | 2019-04-03 00:00:00 | 1.000000 | 2077.000000 | 24924.000000 | 3.000000 | 0.000000 | 0.000000 | 7.000000 |
| 75% | 517.000000 | 2019-12-22 00:00:00 | 2.000000 | 2682.000000 | 32184.000000 | 5.000000 | 3.000000 | 0.000000 | 10.000000 |
| max | 689.000000 | 2020-12-29 00:00:00 | 5.000000 | 3450.000000 | 41400.000000 | 5.000000 | 6.000000 | 6.000000 | 198.000000 |
| std | 199.041453 | NaN | 1.190963 | 763.289240 | 9159.470878 | 1.350125 | 2.196051 | 1.647764 | 25.692049 |
df.columns
Index(['No', 'First Name', 'Last Name', 'Gender', 'Start Date', 'Years',
'Department', 'Country', 'Center', 'Monthly Salary', 'Annual Salary',
'Job Rate', 'Sick Leaves', 'Unpaid Leaves', 'Overtime Hours'],
dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 689 entries, 0 to 688 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 No 689 non-null int64 1 First Name 689 non-null object 2 Last Name 689 non-null object 3 Gender 689 non-null object 4 Start Date 689 non-null datetime64[ns] 5 Years 689 non-null int64 6 Department 689 non-null object 7 Country 689 non-null object 8 Center 689 non-null object 9 Monthly Salary 689 non-null int64 10 Annual Salary 689 non-null int64 11 Job Rate 689 non-null float64 12 Sick Leaves 689 non-null int64 13 Unpaid Leaves 689 non-null int64 14 Overtime Hours 689 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(7), object(6) memory usage: 80.9+ KB
maleavg=male['Monthly Salary'].mean()
femaleavg=female['Monthly Salary'].mean()
Male average:
maleavg
2073.0801781737196
Female average:
femaleavg
2059.075
Check for missing values
print(df.isnull().sum())
No 0 First Name 0 Last Name 0 Gender 0 Start Date 0 Years 0 Department 0 Country 0 Center 0 Monthly Salary 0 Annual Salary 0 Job Rate 0 Sick Leaves 0 Unpaid Leaves 0 Overtime Hours 0 dtype: int64
# Fill missing values only in numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
# Fill missing values in non-numeric columns with a placeholder (e.g., 'Unknown')
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
df[non_numeric_columns] = df[non_numeric_columns].fillna('Unknown')
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
sns.countplot(x='Department', data=df)
plt.title('Employee Distribution by Department')
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(10, 6))
sns.barplot(x='Department', y='Annual Salary', data=df, estimator=np.mean)
plt.title('Average Annual Salary by Department')
plt.xticks(rotation=45)
plt.show()
# Use .loc to safely assign the mean value to missing entries
df.loc[:, 'Annual Salary'] = df['Annual Salary'].fillna(df['Annual Salary'].mean())
plt.figure(figsize=(10, 6))
sns.histplot(df['Years'], kde=True)
plt.title('Employee Years Distribution')
plt.xlabel('Years')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(10, 6))
sns.countplot(x='Gender', data=df)
plt.title('Gender Distribution in the Company')
plt.show()
plt.figure(figsize=(10, 6))
sns.histplot(df['Annual Salary'], kde=True)
plt.title('Distribution of Annual Salary')
plt.xlabel('Annual Salary')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(10, 6))
sns.barplot(x='Department', y='Overtime Hours', data=df, estimator=np.mean)
plt.title('Average Overtime Hours by Department')
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(10, 6))
sns.histplot(df['Job Rate'], kde=True)
plt.title('Distribution of Job Rate')
plt.xlabel('Job Rate')
plt.ylabel('Frequency')
plt.show()
plt.figure(figsize=(10, 6))
sns.barplot(x='Gender', y='Sick Leaves', data=df, estimator=np.mean)
plt.title('Average Sick Leaves by Gender')
plt.show()
Deparment employee ratio by each country
employee_count = df.groupby(['Country', 'Department']).size().unstack(fill_value=0)
# Plotting pie charts for each country
num_countries = len(employee_count.index)
fig, axes = plt.subplots(1, num_countries, figsize=(14, 8)) # Increased figure size
wedges = []
for i, country in enumerate(employee_count.index):
wedge, _, _ = axes[i].pie(employee_count.loc[country], autopct='%1.1f%%', startangle=140)
wedges.append(wedge)
axes[i].set_title(f'{country}', fontsize=14)
# Flatten the wedges list for the legend
wedges = [item for sublist in wedges for item in sublist]
# Adding a single legend for all pie charts
fig.legend(wedges, employee_count.columns, title="Departments", loc="lower center", ncol=3, fontsize=10)
plt.tight_layout(rect=[0, 0.1, 1, 0.95])
plt.show()
import plotly.express as px
country_counts = df['Country'].value_counts().reset_index()
country_counts.columns = ['Country', 'Employee Count']
# Plotting the map
fig = px.choropleth(
country_counts,
locations='Country',
locationmode='country names',
color='Employee Count',
color_continuous_scale='Viridis',
title='Distribution of Employees by Country'
)
# Show plot
fig.show()
df['Year'] = df['Start Date'].dt.year
salary_by_year = df.groupby('Year')['Annual Salary'].mean()
plt.figure(figsize=(12, 7))
plt.plot(salary_by_year.index, salary_by_year.values, marker='o', linestyle='-', color='b')
plt.xlabel('Year')
plt.ylabel('Average Annual Salary')
plt.title('Average Annual Salary Over Time')
plt.grid(True)
plt.show()